In [1]:
import numpy as np
In [2]:
samples = ['The cat sat on the mat.', 'The dog ate my homework.']
In [3]:
# Initializing the token index as an empty dictionary
token_index = {}
In [4]:
# Testing the result of the .split() method
samples[0].split()
Out[4]:
In [5]:
for sample in samples:
    # Each sample is split into words.
    # Punctuation is currently not stripped (see the clean-up sketch after this cell).
    for word in sample.split():
        if word not in token_index:
            # Assign a unique index to each unique word.
            # Index 0 is not assigned to anything.
            token_index[word] = len(token_index) + 1
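As noted in the comment above, punctuation is not stripped, so 'mat.' and 'homework.' keep their trailing periods and would be treated as different tokens from 'mat' and 'homework'. A minimal clean-up sketch, assuming only the standard string module (this helper is not used in the cells that follow):
In [ ]:
import string

def clean_split(sample):
    # Strip leading/trailing punctuation and lowercase each token.
    return [word.strip(string.punctuation).lower() for word in sample.split()]

clean_split(samples[0])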
In [6]:
token_index
Out[6]:
In [7]:
# Taking into consideration only the first 10 words of each sample.
max_length = 10
In [8]:
# Initializing the result array with zeros
# It will be of shape (number_of_samples, max_length, max(token_index.values()) + 1); index 0 is never used.
results = np.zeros((len(samples), max_length, max(token_index.values()) + 1))
In [9]:
# Enumerating through samples and words
# One-hot encoding
for i, sample in enumerate(samples):
    for j, word in list(enumerate(sample.split()))[:max_length]:
        index = token_index.get(word)
        results[i, j, index] = 1.
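As a quick sanity check, the one-hot vectors can be decoded back into words by inverting token_index; this should reproduce the original words (with their trailing punctuation). A minimal sketch, assuming the results array built above:
In [ ]:
# Invert the word index so one-hot positions map back to words.
reverse_index = {index: word for word, index in token_index.items()}

# np.argmax locates the 1 in each one-hot vector; all-zero rows
# (unused timesteps) are skipped via vector.any().
[[reverse_index[int(np.argmax(vector))] for vector in sample if vector.any()]
 for sample in results]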
In [10]:
results
Out[10]:
In [11]:
import string
In [12]:
samples = ['The cat sat on the mat.', 'The dog ate my homework.']
In [13]:
# Assigning all printable ASCII characters
characters = string.printable
In [14]:
characters
Out[14]:
In [15]:
# Tokenizing the characters
token_index = dict(zip(characters, range(1, len(characters) + 1)))
In [16]:
token_index
Out[16]:
In [17]:
# Taking into consideration only the first 50 characters of each sample
max_length = 50
In [18]:
results = np.zeros((len(samples), max_length, max(token_index.values()) + 1))
In [19]:
for i, sample in enumerate(samples):
    for j, character in enumerate(sample[:max_length]):
        index = token_index.get(character)
        results[i, j, index] = 1.
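Since string.printable contains 100 characters and index 0 is left unused, the character-level tensor should have shape (2, 50, 101); a quick check:
In [ ]:
# 100 printable characters plus the unused index 0 give a last axis of size 101.
len(characters), results.shape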
In [20]:
results
Out[20]:
In [21]:
# Importing Keras Tokenizer
from keras.preprocessing.text import Tokenizer
In [22]:
samples = ['The cat sat on the mat.', 'The dog ate my homework.']
In [23]:
# Initializing the tokenizer, which will take into account only the 1,000 most common words.
tokenizer = Tokenizer(num_words=1000)
In [24]:
# Building the dictionary
tokenizer.fit_on_texts(samples)
In [25]:
# Turning the texts into sequences of integers corresponding to the unique words
sequences = tokenizer.texts_to_sequences(samples)
In [26]:
sequences
Out[26]:
In [27]:
# Representing the data as one-hot encoded vectors.
one_hot_results = tokenizer.texts_to_matrix(samples, mode='binary')
In [28]:
one_hot_results
Out[28]:
In [29]:
# Retrieving the index
word_index = tokenizer.word_index
In [30]:
word_index
Out[30]:
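For comparison with the manual approach above, the Keras word index can also be inverted to map the integer sequences back to tokens; note that the Tokenizer has already lowercased the text and stripped punctuation. A minimal sketch:
In [ ]:
# Invert Keras' word -> index mapping to decode the sequences.
reverse_word_index = {index: word for word, index in word_index.items()}
[[reverse_word_index[index] for index in sequence] for sequence in sequences]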
"A variant of one-hot encoding is the so-called "one-hot hashing trick", which can be used when the number of unique tokens in your vocabulary is too large to handle explicitly. Instead of explicitly assigning an index to each word and keeping a reference of these indices in a dictionary, one may hash words into vectors of fixed size. This is typically done with a very lightweight hashing function. The main advantage of this method is that it does away with maintaining an explicit word index, which saves memory and allows online encoding of the data (starting to generate token vectors right away, before having seen all of the available data). The one drawback of this method is that it is susceptible to "hash collisions": two different words may end up with the same hash, and subsequently any machine learning model looking at these hashes won't be able to tell the difference between these words. The likelihood of hash collisions decreases when the dimensionality of the hashing space is much larger than the total number of unique tokens being hashed."
In [31]:
samples = ['The cat sat on the mat.', 'The dog ate my homework.']
In [32]:
# Storing words as vectors of size 1,000.
# Hash collisions are possible; when they occur, the accuracy of the encoding drops.
dimensionality = 1000
max_length = 10
results = np.zeros((len(samples), max_length, dimensionality))
In [33]:
for i, sample in enumerate(samples):
    for j, word in list(enumerate(sample.split()))[:max_length]:
        # Hash the word into a "random" integer index
        # between 0 and dimensionality - 1.
        index = abs(hash(word)) % dimensionality
        results[i, j, index] = 1.
In [34]:
results
Out[34]:
In [35]:
# 2 samples, each truncated to at most 10 words, each word one-hot hashed into a 1,000-dimensional vector
results.shape
Out[35]:
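One caveat worth noting: Python's built-in hash() for strings is randomized per interpreter process (unless PYTHONHASHSEED is fixed), so the bucket a given word lands in can change between runs. A deterministic alternative, sketched here with the standard hashlib module (the helper name stable_hash_index is made up for illustration):
In [ ]:
import hashlib

def stable_hash_index(word, dimensionality=1000):
    # md5 produces the same digest for the same word in every process,
    # so the hashed index is reproducible across runs.
    digest = hashlib.md5(word.encode('utf-8')).hexdigest()
    return int(digest, 16) % dimensionality

stable_hash_index('cat')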